
##############################
###  SET UP ENVIRONMENT    ###
##############################

#  Start from an empty workspace: remove every object that ls() lists
#  before anything below runs.
rm(list=ls())

#  Open a logr log for this run; log_close() is called at the end of the script.
library(logr)
log_open()

#  Change this to match your file system
#  (all relative paths used below are resolved against this directory)
setwd("C:/Users/tuq69844/Dropbox/anti hate treatments")

#  Packages
library(tidyverse)
library(haven)
library(readxl)
library(fastDummies)

#  Helpers

source("scripts/style.R")
#  Strip every space character from each element of a character vector.
#    x: character vector (anything else is coerced to character by gsub)
#    returns: character vector of the same length as x, spaces removed
#  (the body previously referenced `.` instead of its argument `x`)
remove_spaces = function(x) gsub(" ", "", x)

#  Read the codebook and the raw survey export, then replace the raw column
#  names with the project's own names (codebook: yougov -> variable)

codebook = read_xlsx("survey_data/codebook.xlsx")

#  going through as.matrix() leaves every column of d as text
d = as_tibble(as.matrix(read_spss("survey_data/GWAS0042_w2_OUTPUT.sav")))

#  look each raw name up in codebook$yougov and take the matching codebook$variable
d = setNames(d, codebook$variable[match(names(d), codebook$yougov)])
names(d)

#  Write the open-ended responses of rows with z == 1 to their own file

d %>%
  filter(z == 1) %>%
  select(
    caseid,
    outgroup_exp,
    know_jew,
    yes_kn_jew_antisem,
    no_kn_jew_antisem
  ) %>%
  write_csv("replication_file/data_survey_openends.csv")

#  Append the break-off (incomplete) interviews, flag each row as complete
#  (1) or incomplete (0), and keep only rows with a non-missing z

incompletes = read_spss("replication_file/data_survey_breakoffs_raw.sav") %>%
  as.matrix %>%
  as_tibble %>%
  rename(caseid = caseid_new)

#  codebook lookup: yougov name -> project variable name
incompletes = setNames(
  incompletes,
  codebook$variable[match(names(incompletes), codebook$yougov)]
)

d = bind_rows(
  mutate(d, complete = 1),
  mutate(incompletes, complete = 0)
) %>%
  filter(!is.na(z))

#  Export caseid, starttime_w1 and the complete flag for browsing / cleaning

d %>%
  select(caseid, starttime_w1, complete) %>%
  write_csv("replication_file/data_survey_time.csv")

###################
###  CLEANING   ###
###################

#  Clean DVs
#
#  Every column of d is still text here (the raw files were read through
#  as.matrix()), and values may carry padding spaces. Each scale helper
#  therefore converts to numeric BEFORE testing for the missing / skipped
#  codes, exactly as the contact_jew helper below does; comparing the raw
#  text with 8 or -1 would let padded codes slip through.

d = d %>%
  
  #  emotional response: four-point scales to 0-1, missing = -1, skipped = 8
  mutate_at(
    vars(starts_with("emo_")),
    function(x){
      x = as.numeric(x)
      x[x %in% c(-1, 8)] = NA_real_
      (x-1)/3
    }
  ) %>%
  
  #  discrimination: four-point scales to 0-1, missing = -1, skipped = 8, reverse so that high values = perceive more discrim
  mutate_at(
    vars(starts_with("discrim")),
    function(x){
      x = as.numeric(x)
      x[x %in% c(-1, 8)] = NA_real_
      1 - (x-1)/3
    }
  ) %>%
  
  #  thermometers to 0-1
  mutate_at(
    vars(starts_with("therm_")),
    function(x) as.numeric(gsub(" ", "", x))/100
  ) %>%
  
  #  policy: do enough: 5-point scale with DK coded as 8 (DK placed at the midpoint)
  mutate_at(
    vars(starts_with("policy_doenough")),
    function(x) recode(gsub(" ", "", x), `1`=0, `2`=1/4, `8`=1/2, `3`=3/4, `4`=1)
  ) %>%
  
  #  policy: hate crime: five-point scales to 0-1 (reversed), missing = -1, skipped = 8
  mutate_at(
    vars(starts_with("policy_hatecrime_")),
    function(x){
      x = as.numeric(x)
      x[x %in% c(-1, 8)] = NA_real_
      1 - (x-1)/4
    }
  ) %>%
  
  #  stereotypes: five-point scales to 0-1, missing = -1, skipped = 8, reverse so that high values = endorse stereotypes
  mutate_at(
    vars(starts_with("stereo_")),
    function(x){
      x = as.numeric(x)
      x[x %in% c(-1, 8)] = NA_real_
      1 - (x-1)/4
    }
  ) %>%
  
  #  the "honest" item is flipped back relative to the other stereotype items
  mutate(
    stereo_honest_w1 = 1-stereo_honest_w1,
    stereo_honest_w2 = 1-stereo_honest_w2
  ) %>%
  mutate_at(
    #  contact with jews, missing = -1, DK = 9 (why not be consistent across Qs?!)
    vars(starts_with("contact_jew")),
  
    function(x){
      x = as.numeric(x)
      x[x %in% c(-1, 9)] = NA_real_
      (x-1)/5
    }
  ) %>%
  
  #  mean-impute DKs for contact with Jews, no natural midpoint
  mutate(
    contact_jew = ifelse(is.na(contact_jew), mean(contact_jew, na.rm=TRUE), contact_jew)
  ) %>%
  
  #  second contact item: impute with the mean within each value of z.
  #  Observed answers keep their OWN value (the fallback previously returned
  #  contact_jew, which overwrote every observed contact_jew_w2).
  group_by(z) %>%
  mutate(
    contact_jew_w2 = ifelse(is.na(contact_jew_w2), mean(contact_jew_w2, na.rm=TRUE), contact_jew_w2)
  ) %>%
  ungroup()

#  DV indices: per-wave averages of the rescaled items
#  (a missing component makes the whole index NA)

d = mutate(
  d,
  
  #  policy: mean of the two policy items
  policy_index_w1 = (policy_doenough_w1 + policy_hatecrime_w1) / 2,
  policy_index_w2 = (policy_doenough_w2 + policy_hatecrime_w2) / 2,
  
  #  stereotypes: mean of the thirteen stereotype items
  stereo_index_w1 = (
    stereo_wallstreet_w1 + stereo_intlmkt_w1 + stereo_powerUS_w1 +
      stereo_powerglobal_w1 + stereo_entertainment_w1 + stereo_holocaust_w1 +
      stereo_immigrants_w1 + stereo_headofthings_w1 + stereo_honest_w1 +
      stereo_ownkind_w1 + stereo_shrewd_w1 + stereo_shady_w1 +
      stereo_thinkbetter_w1
  ) / 13,
  stereo_index_w2 = (
    stereo_wallstreet_w2 + stereo_intlmkt_w2 + stereo_powerUS_w2 +
      stereo_powerglobal_w2 + stereo_entertainment_w2 + stereo_holocaust_w2 +
      stereo_immigrants_w2 + stereo_headofthings_w2 + stereo_honest_w2 +
      stereo_ownkind_w2 + stereo_shrewd_w2 + stereo_shady_w2 +
      stereo_thinkbetter_w2
  ) / 13
)

#  Clean covariates

#  z: strip spaces from the stored value
d = mutate(d, z = gsub(" ", "", z))

#  these items: keep only the number parsed out of each value, stored back as text
d = mutate_at(
  d,
  vars(
    employ, marital, votepref_2020, voted_2020, newsint,
    votepref_2016, votereg, child18, urbanicity
  ),
  function(x) as.character(parse_number(x))
)

#  rows with no value for the Protestant follow-up are labelled "Not asked"
d$relig_whichprotestant = replace(
  d$relig_whichprotestant,
  is.na(d$relig_whichprotestant),
  "Not asked"
)

#  Recode covariates.
#
#  Raw columns are text at this point (the data were read through as.matrix())
#  and values may carry padding spaces. Numeric codes are therefore converted
#  with as.numeric() before being compared with a number, and codes handed to
#  recode() are stripped of spaces first.

d = d %>%
  mutate_at(
    vars(starts_with("internet"), starts_with("conspir"), starts_with("troll")), 
    function(x) as.numeric(gsub(" ", "", x))
  ) %>%
  mutate(
    caseid = as.numeric(gsub(" ", "", caseid)),
    z = as.numeric(z==1),
    
    age = 2021 - as.numeric(age),
    female = as.numeric(as.numeric(gender)==2),
    race = recode_factor(gsub(" ", "", race), `1`="White", `2`="Black", `3`="Hispanic", `4`="Asian", .default="Other"),
    educ = recode(gsub(" ", "", educ), `1`="Less than HS", `2`="HS grad", `3`="Some college", `4`="2-year", `5`="4-year", `6`="Graduate"),
    #  NOTE(review): "Partntership" is misspelled, but the label becomes a dummy-column name in the output, so it is left unchanged
    marital = recode(marital, `1`="Married", `2`="Separated", `3`="Divorced", `4`="Widowed", `5`="Never married", `6`="Partntership"),
    child18 = as.numeric(child18==1),
    employ = recode(employ, `1`="Full-time", `2`="Part-time", `3`="Unemployed", `4`="Laid off", `5`="Retired", `6`="Disabled", `7`="Homemaker", `8`="Student", `9`="Other"),
    urbanicity = recode(urbanicity, `1`="City", `2`="Suburb", `3`="Town", `4`="Rural", `5`="Other"),
    
    #  attrit = 1 when all four of these wave-2 items are missing; undefined (NA) for incompletes
    attrit = as.numeric(is.na(therm_asian_w2) & is.na(therm_jew_w2) & is.na(therm_black_w2) & is.na(discrim_jew_w2)),
    attrit = ifelse(complete==0, NA_integer_, attrit),
    
    #  three income bands from the income code; anything above 16 (e.g. 97)
    #  or missing falls through to "Missing".
    #  The code has to be compared as a NUMBER: compared as text, "10" sorts
    #  before "5", which pushed codes 10-16 into the lowest band.
    #  (labels are kept verbatim because they become dummy-column names)
    faminc = case_when(
      as.numeric(faminc) <= 5 ~ "5ok or less",
      as.numeric(faminc) <= 9 ~ "50-100k",
      as.numeric(faminc) <= 16 ~ "100k or more",
      TRUE ~ "Missing"
    ),
      
    #  forget about recoding this unless necessary
    state = factor(state),
    
    votereg = as.numeric(votereg==1),
    voted_2020 = as.numeric(voted_2020==1),
    votepref_2020 = recode(votepref_2020, `1`="Biden", `2`="Trump", `6`="Did not vote", .default="Other"),
    votepref_2016 = recode(votepref_2016, `1`="Clinton", `2`="Trump", `7`="Did not vote", .default="Other"),
    
    #  if ever admit not voting, assume didn't vote
    voted_2020 = ifelse(votepref_2020=="Did not vote", 0, voted_2020),
    votepref_2020 = ifelse(voted_2020==0, "Did not vote", votepref_2020),
    
    #  recode not sure to moderate
    ideo5 = as.numeric(ideo5),
    ideo5 = ifelse(ideo5==6, 3, ideo5),
    
    newsint = recode(newsint, `1`=0, `2`=1/4, `7`=1/2, `3`=3/4, `4`=1),
    
    relig_bornagain = as.numeric(as.numeric(relig_bornagain)==1),
    relig_imp = 1 - (as.numeric(relig_imp)-1)/3,
    
    #  assume those who DK church attendance + prayer frequency are seldom, then recode to 0-1
    relig_attend = 1 - (recode(as.numeric(relig_attend), `7`=5)-1)/5,
    relig_pray = 1 - (recode(as.numeric(relig_pray), `8`=6)-1)/6,
    
    relig_which = recode(gsub(" ", "", relig_which), `1`="Protestant", `2`="Catholic", `5`="Jewish", `9`="Atheist", `10`="Agnostic", `11`="Nothing", .default="Other"),
    relig_whichprotestant = recode(gsub(" ", "", relig_whichprotestant), `1`="Baptist", `2`="Methodist", `4`="Lutheran", `5`="Pentecostal", `6`="Episcopalian", .default="Other"),
    
    #  CRT items: 1 when the answer equals the keyed value, then share of the three items correct
    crt_ball = as.numeric(gsub(" ", "", crt_ball)==5),
    crt_pond = as.numeric(gsub(" ", "", crt_pond)==47),
    crt_widget = as.numeric(gsub(" ", "", crt_widget)==5),
    crt_total = (crt_ball + crt_pond + crt_widget) / 3,
    crt_NA = as.numeric(is.na(crt_total)),
    
    #  knowledge items: 1 when the answer equals the keyed value, then share of the four items correct
    knowl_sen = as.numeric(gsub(" ", "", knowl_sen)==6),
    knowl_house = as.numeric(as.numeric(knowl_house)==1),
    knowl_foreignaid = as.numeric(as.numeric(knowl_foreignaid)==1),
    knowl_roberts = as.numeric(gsub(" ", "", knowl_roberts)==3),
    knowl_total = (knowl_sen + knowl_house + knowl_foreignaid + knowl_roberts) / 4,
    
    #  height in inches from the feet and inches items
    troll_height = as.numeric(troll_height_feet)*12 + as.numeric(troll_height_in),
    troll_sleep = as.numeric(troll_sleep),
    troll_smoke = as.numeric(troll_smoke),
    
    #  count of screener flags; the height bounds differ by the female indicator
    troll_total = (troll_smoke < 8 & troll_smoke >= 0) + (troll_limb == 1) + (troll_sleep > 11) + (troll_height < 57 | troll_height > 70)*female + (troll_height < 61 | troll_height > 76)*(1-female),
    
    troll = as.numeric(troll_total >= 2),
    
    conspir_index = (conspir_murder + conspir_smallGroup + conspir_unknowns) / 15,
    
    #  empathy: code 8 -> NA, otherwise five-point scale to 0-1
    empathy_otherguy = as.numeric(gsub(" ", "", empathy_otherguy)),
    empathy_otherguy = ifelse(empathy_otherguy==8, NA_real_, (empathy_otherguy-1)/4),
    empathy_imagine = as.numeric(gsub(" ", "", empathy_imagine)),
    empathy_imagine = ifelse(empathy_imagine==8, NA_real_, (empathy_imagine-1)/4),
    
    empathy_index = (empathy_otherguy + empathy_imagine) / 2,
    
    #  pid7: code 8 is moved to the scale midpoint (4), then the scale is shifted to start at 0
    pid7 = as.numeric(pid7),
    pid7 = ifelse(pid7 == 8, 4, pid7),    
    pid7 = pid7-1,
    
    #  pid3: codes 4 and 5 are folded into category 2
    pid3 = as.numeric(pid3),
    pid3 = ifelse(pid3 %in% 4:5, 2, pid3),
    
    internet_more_mobile = as.numeric(internet_mobile < internet_desktop),
    
    starttime_w1 = as_datetime(starttime_w1)
    
  ) %>%
  #  NOTE(review): starts_with("troll_") also removes troll_total; only the
  #  binary troll flag survives this step — confirm that is intended
  select(
    -starts_with("troll_"), -race_other
  )

#  (inspection only: this value is not stored anywhere)
c(codebook$variable[codebook$purpose == "covariate"], "crt_total", "conspir_index", "troll_total", "knowl_total")

#  Add indicators for categorical covariates

dummy_vars = c(
  "educ", "faminc", "race", "marital", "urbanicity", "employ", "state",
  "relig_which", "relig_whichprotestant", "votepref_2020", "votepref_2016"
)

#  expand each categorical variable into indicator columns
d_controls = dummy_cols(d, dummy_vars)

#  column names: remove spaces and hyphens
names(d_controls) = gsub(" |-", "", names(d_controls))

#  drop the original categorical columns, keeping their indicators
d_controls = d_controls[, !(names(d_controls) %in% dummy_vars)]

#  candidate covariates: codebook covariates, every column whose name matches
#  one of the dummy variables, and the derived measures (duplicates removed)
covariates = unique(c(
  codebook$variable[codebook$purpose == "covariate"],
  grep(paste(dummy_vars, collapse="|"), names(d_controls), value = TRUE),
  "crt_total", "empathy_index", "conspir_index", "troll_total", "knowl_total", "internet_more_mobile", "attrit"
))

#  keep only candidates that actually exist in d_controls
covariates = covariates[covariates %in% names(d_controls)]

#  caseid plus the covariates, all coerced to numeric
d_controls = d_controls[c("caseid", covariates)] %>%
  mutate_all(as.numeric)

########################
###   STACK BY DV    ###
########################

#  Long format: one row per respondent (caseid, z) and outcome variable
dv = d[, c("caseid", "z", codebook$variable[codebook$purpose=="dv"])] %>%
  gather(variable, value, -caseid, -z) %>%
  
  #  sort key by variable-name pattern:
  #  emo = -5, therm = -4, discrim = -3, policy = -2, stereo = -1, no match = 0
  mutate(
    var_order = -(
      5*grepl("emo", variable) +
        4*grepl("therm", variable) +
        3*grepl("discrim", variable) +
        2*grepl("policy", variable) +
        1*grepl("stereo", variable)
    )
  ) %>%
  arrange(var_order) %>%
  
  #  attach the codebook columns label through hypothesis_label
  left_join(select(codebook, variable, label:hypothesis_label)) %>%
  
  #  wave = text after the last "_w"; var = variable name without its "_w?" suffix
  mutate(
    wave = gsub(".+_w", "", variable),
    value = as.numeric(value),
    var = gsub("_w.$", "", variable)
  )

##############################
###  SPLIT OFF COVARIATES  ###
##############################

#  Build the respondent-level covariate table FIRST. The screening step below
#  reads and filters d_covar, so it must exist before that step runs (the
#  workspace is cleared at the top of the script, so nothing is left over
#  from an earlier session).

d_covar = left_join(
  d %>% select(caseid:z, starttime_w1, troll, complete),
  d_controls
)

##############################
###  REMOVE SUBJECTS WHO   ###
###  NEVER REACHED TREAT   ###
##############################

#  Count missing DV values per respondent; all_missing = 1 when that count is 40
#  NOTE(review): 40 is presumably the total number of DV columns — confirm against the codebook
d_covar = left_join(
  d_covar,
  dv %>% group_by(caseid) %>% summarize(dvs_missing = sum(is.na(value))) %>% mutate(all_missing = as.numeric(dvs_missing == 40))
)

#  Never reached treatment: every DV missing AND internet_desktop missing
never_reached_treatment = d_covar$caseid[which(d_covar$all_missing==1 & is.na(d_covar$internet_desktop))]

#  Remove those respondents from all three tables, then drop the helper columns
dv = dv %>% filter(!(caseid %in% never_reached_treatment))

d_covar = d_covar %>% filter(!(caseid %in% never_reached_treatment)) %>% select(-all_missing, -dvs_missing)

d = d %>% filter(!(caseid %in% never_reached_treatment)) 

#  Drop covariates that are functions of other covariates

d_covar = select(
  d_covar,
  
  #  indicator columns removed by name pattern
  -contains("_Other", ignore.case = FALSE),
  -contains("whichProtestant"),
  -contains("state"),
  -contains("weight"),
  
  #  individual indicator columns
  -faminc_Missing, -faminc_5okorless,
  -employ_Unemployed, -employ_Student,
  -marital_Separated,
  
  #  components of crt_total, knowl_total, empathy_index and conspir_index
  -crt_ball, -crt_widget, -crt_pond,
  -knowl_roberts, -knowl_foreignaid, -knowl_sen, -knowl_house,
  -empathy_otherguy, -empathy_imagine,
  -conspir_murder, -conspir_smallGroup, -conspir_unknowns,
  
  #  remaining single columns
  -voted_2020,
  -pid3,
  
  #  prereg specifies focus on respondents who use mobile more
  -internet_desktop, -internet_mobile
)

#  Restore initial file structure (prior to receipt of incompletes)
#  so that analysis script still works: completes and break-offs go into
#  separate tables, neither of which carries the complete flag

complete_ids = pull(filter(d_covar, complete == 1), caseid)

d_covar_out = d_covar %>%
  filter(caseid %in% complete_ids) %>%
  select(-complete)

d_break = d_covar %>%
  filter(!(caseid %in% complete_ids)) %>%
  select(-complete)

#  DV rows are exported for completes only
d_dv_out = filter(dv, caseid %in% complete_ids)

#  Export the three cleaned tables, then close the log

d_covar_out %>% write_rds("replication_file/data_survey_completes_clean.rds")
d_break     %>% write_rds("replication_file/data_survey_breakoffs_clean.rds")
d_dv_out    %>% write_rds("replication_file/data_survey_DVs_clean.rds")

log_close()